library(ggplot2)
library(arules)
library(arulesViz)
library(dplyr)
library(plotly)
### Your working directory will be different.
setwd("/Users/aoliv01/Desktop/GradSchool/2018-2/DataMining/Homework/HW3")
getwd()
[1] "/Users/aoliv01/Desktop/GradSchool/2018-2/DataMining/Homework/HW3"
### Reading in the data
raw <- read.csv("bankdata_csv_all.csv")
str(raw)
'data.frame':   600 obs. of  12 variables:
 $ id         : Factor w/ 600 levels "ID12101","ID12102",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ age        : int  48 40 51 23 57 57 22 58 37 54 ...
 $ sex        : Factor w/ 2 levels "FEMALE","MALE": 1 2 1 1 1 1 2 2 1 2 ...
 $ region     : Factor w/ 4 levels "INNER_CITY","RURAL",..: 1 4 1 4 2 4 2 4 3 4 ...
 $ income     : num  17546 30085 16575 20375 50576 ...
 $ married    : Factor w/ 2 levels "NO","YES": 1 2 2 2 2 2 1 2 2 2 ...
 $ children   : int  1 3 0 3 0 2 0 0 2 2 ...
 $ car        : Factor w/ 2 levels "NO","YES": 1 2 2 1 1 1 1 2 2 2 ...
 $ save_act   : Factor w/ 2 levels "NO","YES": 1 1 2 1 2 2 1 2 1 2 ...
 $ current_act: Factor w/ 2 levels "NO","YES": 1 2 2 2 1 2 2 2 1 2 ...
 $ mortgage   : Factor w/ 2 levels "NO","YES": 1 2 1 1 1 1 1 1 1 1 ...
 $ pep        : Factor w/ 2 levels "NO","YES": 2 1 1 1 1 2 2 1 1 1 ...
attach(raw)
The following objects are masked from raw (pos = 3):

    age, car, children, current_act, id, income, married, mortgage, pep, region,
    save_act, sex

The following objects are masked from raw (pos = 6):

    age, car, children, current_act, id, income, married, mortgage, pep, region,
    save_act, sex
## Checking for null records
total_na <- sum(is.na(raw))
cat("Total na is: ", total_na)
Total na is:  0
## Removing the id column
raw <- raw[,-1]
## Quick boxplot to check for income outliers
income_box <- ggplot(raw, aes(x = pep, y = income)) + 
  geom_boxplot(fill = "steelblue", color = "darkorange") +
  theme(plot.title = element_text(hjust = 0.5, color = "darkorange"))
income_box

## Box plot to check for age outliers or odd values
age_box <- ggplot(raw, aes(x = pep, y = age)) + 
  geom_boxplot(fill = "steelblue", color = "darkorange") +
  theme(plot.title = element_text(hjust = 0.5, color = "darkorange"))
age_box

## Chart showing income and age of PEP customers and not PEP customers
ggplot(data = raw, aes(x = income, y = age)) + 
  geom_point(colour = "steelblue", alpha = .5) + 
  geom_smooth(method = "lm") +
  facet_wrap(~ pep) + 
  ggtitle("PEP by Age and Income")

## Checking to see that age and income for PEP and non-PEP are comparable
## This will also show outliers
aggregate(income ~ pep, data = raw, median)
aggregate(age ~ pep, data = raw, median)
aggregate(income ~ pep, data = raw, max)
aggregate(age ~ pep, data = raw, max)
aggregate(income ~ pep, data = raw, min)
aggregate(age ~ pep, data = raw, min)
## Discretizing record data into transactional data
change <- c("id", "age", "income", "children")
disc <- raw[ , !(names(raw) %in% change)]
disc$age <- discretize(
  raw$age,
  method = "cluster",
  labels = c("young", "middle", "older")
  )
disc$income <- discretize(
    raw$income,
    method = "cluster",
    labels = c("low", "medium", "high")
  )
disc$children <- discretize(
    raw$children,
    method = "cluster",
    labels = c("low", "medium", "many")
  )
detach(raw)
summary(disc)
     sex             region    married    car      save_act  current_act mortgage   pep     
 FEMALE:300   INNER_CITY:269   NO :204   NO :304   NO :186   NO :145     NO :391   NO :326  
 MALE  :300   RURAL     : 96   YES:396   YES:296   YES:414   YES:455     YES:209   YES:274  
              SUBURBAN  : 62                                                                
              TOWN      :173                                                                
     age         income      children  
 young :183   low   :273   low   :398  
 middle:226   medium:221   medium:134  
 older :191   high  :106   many  : 68  
                                       
## Running apriori algorithm
rules <- apriori(disc, parameter = list(supp = 0.025, conf = 0.75, maxlen = 4))
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target   ext
       0.75    0.1    1 none FALSE            TRUE       5   0.025      1      4  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 15 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[27 item(s), 600 transaction(s)] done [0.00s].
sorting and recoding items ... [27 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4
Mining stopped (maxlen reached). Only patterns up to a length of 4 returned!
 done [0.00s].
writing ... [2293 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
rules<-sort(rules, by="confidence", decreasing=TRUE)
options(digits=2)
inspect(head(rules, 20))
     lhs                                rhs            support confidence lift count
[1]  {income=high}                   => {save_act=YES} 0.177   1          1.4  106  
[2]  {save_act=NO,children=many}     => {pep=NO}       0.037   1          1.8   22  
[3]  {region=RURAL,income=high}      => {save_act=YES} 0.045   1          1.4   27  
[4]  {income=high,children=medium}   => {save_act=YES} 0.047   1          1.4   28  
[5]  {current_act=NO,income=high}    => {save_act=YES} 0.035   1          1.4   21  
[6]  {region=TOWN,income=high}       => {save_act=YES} 0.038   1          1.4   23  
[7]  {age=older,income=high}         => {save_act=YES} 0.150   1          1.4   90  
[8]  {married=NO,income=high}        => {save_act=YES} 0.057   1          1.4   34  
[9]  {mortgage=YES,income=high}      => {save_act=YES} 0.057   1          1.4   34  
[10] {age=middle,income=high}        => {save_act=YES} 0.027   1          1.4   16  
[11] {region=INNER_CITY,income=high} => {save_act=YES} 0.073   1          1.4   44  
[12] {pep=YES,income=high}           => {save_act=YES} 0.105   1          1.4   63  
[13] {car=YES,income=high}           => {save_act=YES} 0.090   1          1.4   54  
[14] {sex=MALE,income=high}          => {save_act=YES} 0.088   1          1.4   53  
[15] {sex=FEMALE,income=high}        => {save_act=YES} 0.088   1          1.4   53  
[16] {car=NO,income=high}            => {save_act=YES} 0.087   1          1.4   52  
[17] {pep=NO,income=high}            => {save_act=YES} 0.072   1          1.4   43  
[18] {mortgage=NO,income=high}       => {save_act=YES} 0.120   1          1.4   72  
[19] {married=YES,income=high}       => {save_act=YES} 0.120   1          1.4   72  
[20] {income=high,children=low}      => {save_act=YES} 0.110   1          1.4   66  
# Finding the highest ranked rules by support and confidence
# Writing a csv file of the rules
highest <- data.frame(
       lhs = labels(lhs(rules)),
       rhs = labels(rhs(rules)),
      rules@quality)
highest <- highest[which(highest$support > mean(highest$support) & highest$confidence > mean(highest$confidence) & highest$lift > 1),]
highest <- highest[order(highest$confidence, highest$support),]
highest <- highest[1:25,]
write.csv(highest, file = "BankRuleSets.csv")
## The top rules for rhs = PEP
inspect(subset(rules, subset = support > mean(support) & confidence > mean(confidence) & lift > mean(lift) &  rhs %pin% "pep=YES" ))
    lhs                                      rhs       support confidence lift count
[1] {married=NO,mortgage=NO,children=low} => {pep=YES} 0.12    0.92       2    70   
## All rules for rhs = PEP
inspect(subset(rules, subset = rhs %pin% "pep=YES" ))
     lhs                                              rhs       support confidence lift count
[1]  {married=NO,mortgage=NO,income=high}          => {pep=YES} 0.040   0.96       2.1  24   
[2]  {married=NO,mortgage=NO,children=low}         => {pep=YES} 0.117   0.92       2.0  70   
[3]  {age=older,income=high,children=medium}       => {pep=YES} 0.037   0.92       2.0  22   
[4]  {income=high,children=medium}                 => {pep=YES} 0.042   0.89       2.0  25   
[5]  {save_act=YES,income=high,children=medium}    => {pep=YES} 0.042   0.89       2.0  25   
[6]  {mortgage=NO,income=high,children=medium}     => {pep=YES} 0.027   0.89       1.9  16   
[7]  {married=NO,save_act=NO,children=low}         => {pep=YES} 0.065   0.89       1.9  39   
[8]  {sex=MALE,income=high,children=medium}        => {pep=YES} 0.025   0.88       1.9  15   
[9]  {sex=MALE,married=NO,income=high}             => {pep=YES} 0.025   0.88       1.9  15   
[10] {current_act=YES,income=high,children=medium} => {pep=YES} 0.035   0.88       1.9  21   
[11] {married=NO,mortgage=NO,age=older}            => {pep=YES} 0.067   0.87       1.9  40   
[12] {save_act=NO,mortgage=YES,children=low}       => {pep=YES} 0.068   0.85       1.9  41   
[13] {sex=MALE,married=NO,age=older}               => {pep=YES} 0.028   0.85       1.9  17   
[14] {married=NO,save_act=NO,age=older}            => {pep=YES} 0.025   0.83       1.8  15   
[15] {married=NO,income=high}                      => {pep=YES} 0.047   0.82       1.8  28   
[16] {married=NO,save_act=YES,income=high}         => {pep=YES} 0.047   0.82       1.8  28   
[17] {married=NO,current_act=YES,income=high}      => {pep=YES} 0.038   0.82       1.8  23   
[18] {married=NO,age=older,income=high}            => {pep=YES} 0.037   0.81       1.8  22   
[19] {married=NO,age=older,children=low}           => {pep=YES} 0.052   0.79       1.7  31   
[20] {region=INNER_CITY,married=NO,age=older}      => {pep=YES} 0.045   0.79       1.7  27   
[21] {married=NO,car=NO,age=older}                 => {pep=YES} 0.038   0.79       1.7  23   
[22] {region=INNER_CITY,married=NO,children=low}   => {pep=YES} 0.080   0.79       1.7  48   
[23] {married=NO,current_act=NO,children=low}      => {pep=YES} 0.035   0.78       1.7  21   
[24] {married=NO,mortgage=NO,income=medium}        => {pep=YES} 0.068   0.77       1.7  41   
[25] {sex=MALE,age=older,children=medium}          => {pep=YES} 0.028   0.77       1.7  17   
[26] {save_act=NO,age=older,children=low}          => {pep=YES} 0.033   0.77       1.7  20   
[27] {married=NO,income=medium,children=low}       => {pep=YES} 0.072   0.77       1.7  43   
[28] {car=YES,mortgage=NO,income=high}             => {pep=YES} 0.045   0.75       1.6  27   
## Formatting a new dataset to plot
ruledf = data.frame(
       lhs = labels(lhs(rules)),
       rhs = labels(rhs(rules)), 
       rules@quality)
ruledf$pep <- ifelse(ruledf$rhs == "{pep=YES}", 'YesPep', 'NoPep')
## Plotting the new dataset in an interactive 3D plot
p <- plot_ly(ruledf,
              x = ~confidence,
              y = ~lift,
              z = ~support,
              color = ~ruledf$pep,
              colors = c('steelblue', 'darkorange'),
              marker = list(size = 4, opacity = 0.35)
             ) %>% 
    add_markers() %>%
    layout(scene = list(
                   xaxis = list(title = 'support'),
                   yaxis = list(title = 'confidence'),
                   zaxis = list(title = 'lift')
                   ))
p
LS0tCnRpdGxlOiAiSG9tZXdvcmsgMyAtIE9saXZpZXJpIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCmBgYHtyfQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkoYXJ1bGVzKQpsaWJyYXJ5KGFydWxlc1ZpeikKbGlicmFyeShkcGx5cikKbGlicmFyeShwbG90bHkpCmBgYAoKYGBge3J9CiMjIyBZb3VyIHdvcmtpbmcgZGlyZWN0b3J5IHdpbGwgYmUgZGlmZmVyZW50LgpzZXR3ZCgiL1VzZXJzL2FvbGl2MDEvRGVza3RvcC9HcmFkU2Nob29sLzIwMTgtMi9EYXRhTWluaW5nL0hvbWV3b3JrL0hXMyIpCmdldHdkKCkKYGBgCgpgYGB7cn0KIyMjIFJlYWRpbmcgaW4gdGhlIGRhdGEKcmF3IDwtIHJlYWQuY3N2KCJiYW5rZGF0YV9jc3ZfYWxsLmNzdiIpCnN0cihyYXcpCmBgYApgYGB7cn0KYXR0YWNoKHJhdykKYGBgCgpgYGB7cn0KIyMgQ2hlY2tpbmcgZm9yIG51bGwgcmVjb3Jkcwp0b3RhbF9uYSA8LSBzdW0oaXMubmEocmF3KSkKY2F0KCJUb3RhbCBuYSBpczogIiwgdG90YWxfbmEpCmBgYAoKYGBge3J9CiMjIFJlbW92aW5nIHRoZSBpZCBjb2x1bW4KcmF3IDwtIHJhd1ssLTFdCmBgYAoKYGBge3J9CiMjIFF1aWNrIGJveHBsb3QgdG8gY2hlY2sgZm9yIGluY29tZSBvdXRsaWVycwppbmNvbWVfYm94IDwtIGdncGxvdChyYXcsIGFlcyh4ID0gcGVwLCB5ID0gaW5jb21lKSkgKyAKICBnZW9tX2JveHBsb3QoZmlsbCA9ICJzdGVlbGJsdWUiLCBjb2xvciA9ICJkYXJrb3JhbmdlIikgKwogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAwLjUsIGNvbG9yID0gImRhcmtvcmFuZ2UiKSkKCmluY29tZV9ib3gKYGBgCgpgYGB7cn0KIyMgQm94IHBsb3QgdG8gY2hlY2sgZm9yIGFnZSBvdXRsaWVycyBvciBvZGQgdmFsdWVzCmFnZV9ib3ggPC0gZ2dwbG90KHJhdywgYWVzKHggPSBwZXAsIHkgPSBhZ2UpKSArIAogIGdlb21fYm94cGxvdChmaWxsID0gInN0ZWVsYmx1ZSIsIGNvbG9yID0gImRhcmtvcmFuZ2UiKSArCiAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChoanVzdCA9IDAuNSwgY29sb3IgPSAiZGFya29yYW5nZSIpKQphZ2VfYm94CmBgYAoKYGBge3J9CiMjIENoYXJ0IHNob3dpbmcgaW5jb21lIGFuZCBhZ2Ugb2YgUEVQIGN1c3RvbWVycyBhbmQgbm90IFBFUCBjdXN0b21lcnMKZ2dwbG90KGRhdGEgPSByYXcsIGFlcyh4ID0gaW5jb21lLCB5ID0gYWdlKSkgKyAKICBnZW9tX3BvaW50KGNvbG91ciA9ICJzdGVlbGJsdWUiLCBhbHBoYSA9IC41KSArIAogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIpICsKICBmYWNldF93cmFwKH4gcGVwKSArIAogIGdndGl0bGUoIlBFUCBieSBBZ2UgYW5kIEluY29tZSIpCmBgYAoKYGBge3J9CiMjIENoZWNraW5nIHRvIHNlZSB0aGF0IGFnZSBhbmQgaW5jb21lIGZvciBQRVAgYW5kIG5vbi1QRVAgYXJlIGNvbXBhcmFibGUKIyMgVGhpcyB3aWxsIGFsc28gc2hvdyBvdXRsaWVycwphZ2dyZWdhdGUoaW5jb21lIH4gcGVwLCBkYXRhID0gcmF3LCBtZWRpYW4pCmFnZ3JlZ2F0ZShhZ2UgfiBwZXAsIGRhdGEgPSByYXcsIG1lZGlhbikKYWdncmVnYXRlKGluY29tZSB+IHBlcCwgZGF0YSA9IHJhdywgbWF4KQphZ2dyZWdhdGUoYWdlIH4gcGVwLCBkYXRhID0gcmF3LCBtYXgpCmFnZ3JlZ2F0ZShpbmNvbWUgfiBwZXAsIGRhdGEgPSByYXcsIG1pbikKYWdncmVnYXRlKGFnZSB+IHBlcCwgZGF0YSA9IHJhdywgbWluKQpgYGAKCgpgYGB7cn0KIyMgRGlzY3JldGl6aW5nIHJlY29yZCBkYXRhIGludG8gdHJhbnNhY3Rpb25hbCBkYXRhCmNoYW5nZSA8LSBjKCJpZCIsICJhZ2UiLCAiaW5jb21lIiwgImNoaWxkcmVuIikKZGlzYyA8LSByYXdbICwgIShuYW1lcyhyYXcpICVpbiUgY2hhbmdlKV0KCmRpc2MkYWdlIDwtIGRpc2NyZXRpemUoCiAgcmF3JGFnZSwKICBtZXRob2QgPSAiY2x1c3RlciIsCiAgbGFiZWxzID0gYygieW91bmciLCAibWlkZGxlIiwgIm9sZGVyIikKICApCmRpc2MkaW5jb21lIDwtIGRpc2NyZXRpemUoCiAgICByYXckaW5jb21lLAogICAgbWV0aG9kID0gImNsdXN0ZXIiLAogICAgbGFiZWxzID0gYygibG93IiwgIm1lZGl1bSIsICJoaWdoIikKICApCmRpc2MkY2hpbGRyZW4gPC0gZGlzY3JldGl6ZSgKICAgIHJhdyRjaGlsZHJlbiwKICAgIG1ldGhvZCA9ICJjbHVzdGVyIiwKICAgIGxhYmVscyA9IGMoImxvdyIsICJtZWRpdW0iLCAibWFueSIpCiAgKQpkZXRhY2gocmF3KQpzdW1tYXJ5KGRpc2MpCmBgYAoKCmBgYHtyfQojIyBSdW5uaW5nIGFwcmlvcmkgYWxnb3JpdGhtCnJ1bGVzIDwtIGFwcmlvcmkoZGlzYywgcGFyYW1ldGVyID0gbGlzdChzdXBwID0gMC4wMjUsIGNvbmYgPSAwLjc1LCBtYXhsZW4gPSA0KSkKcnVsZXM8LXNvcnQocnVsZXMsIGJ5PSJjb25maWRlbmNlIiwgZGVjcmVhc2luZz1UUlVFKQpvcHRpb25zKGRpZ2l0cz0yKQppbnNwZWN0KGhlYWQocnVsZXMsIDIwKSkKYGBgCgpgYGB7cn0KIyBGaW5kaW5nIHRoZSBoaWdoZXN0IHJhbmtlZCBydWxlcyBieSBzdXBwb3J0IGFuZCBjb25maWRlbmNlCiMgV3JpdGluZyBhIGNzdiBmaWxlIG9mIHRoZSBydWxlcwpoaWdoZXN0IDwtIGRhdGEuZnJhbWUoCiAgICAgICBsaHMgPSBsYWJlbHMobGhzKHJ1bGVzKSksCiAgICAgICByaHMgPSBsYWJlbHMocmhzKHJ1bGVzKSksCiAgICAgIHJ1bGVzQHF1YWxpdHkpCmhpZ2hlc3QgPC0gaGlnaGVzdFt3aGljaChoaWdoZXN0JHN1cHBvcnQgPiBtZWFuKGhpZ2hlc3Qkc3VwcG9ydCkgJiBoaWdoZXN0JGNvbmZpZGVuY2UgPiBtZWFuKGhpZ2hlc3QkY29uZmlkZW5jZSkgJiBoaWdoZXN0JGxpZnQgPiAxKSxdCmhpZ2hlc3QgPC0gaGlnaGVzdFtvcmRlcihoaWdoZXN0JGNvbmZpZGVuY2UsIGhpZ2hlc3Qkc3VwcG9ydCksXQpoaWdoZXN0IDwtIGhpZ2hlc3RbMToyNSxdCndyaXRlLmNzdihoaWdoZXN0LCBmaWxlID0gIkJhbmtSdWxlU2V0cy5jc3YiKQpgYGAKCgpgYGB7cn0KIyMgVGhlIHRvcCBydWxlcyBmb3IgcmhzID0gUEVQCmluc3BlY3Qoc3Vic2V0KHJ1bGVzLCBzdWJzZXQgPSBzdXBwb3J0ID4gbWVhbihzdXBwb3J0KSAmIGNvbmZpZGVuY2UgPiBtZWFuKGNvbmZpZGVuY2UpICYgbGlmdCA+IG1lYW4obGlmdCkgJiAgcmhzICVwaW4lICJwZXA9WUVTIiApKQpgYGAKCmBgYHtyfQojIyBBbGwgcnVsZXMgZm9yIHJocyA9IFBFUAppbnNwZWN0KHN1YnNldChydWxlcywgc3Vic2V0ID0gcmhzICVwaW4lICJwZXA9WUVTIiApKQpgYGAKCmBgYHtyfQojIyBGb3JtYXR0aW5nIGEgbmV3IGRhdGFzZXQgdG8gcGxvdApydWxlZGYgPSBkYXRhLmZyYW1lKAogICAgICAgbGhzID0gbGFiZWxzKGxocyhydWxlcykpLAogICAgICAgcmhzID0gbGFiZWxzKHJocyhydWxlcykpLCAKICAgICAgIHJ1bGVzQHF1YWxpdHkpCnJ1bGVkZiRwZXAgPC0gaWZlbHNlKHJ1bGVkZiRyaHMgPT0gIntwZXA9WUVTfSIsICdZZXNQZXAnLCAnTm9QZXAnKQoKIyMgUGxvdHRpbmcgdGhlIG5ldyBkYXRhc2V0IGluIGFuIGludGVyYWN0aXZlIDNEIHBsb3QKcCA8LSBwbG90X2x5KHJ1bGVkZiwKICAgICAgICAgICAgICB4ID0gfmNvbmZpZGVuY2UsCiAgICAgICAgICAgICAgeSA9IH5saWZ0LAogICAgICAgICAgICAgIHogPSB+c3VwcG9ydCwKICAgICAgICAgICAgICBjb2xvciA9IH5ydWxlZGYkcGVwLAogICAgICAgICAgICAgIGNvbG9ycyA9IGMoJ3N0ZWVsYmx1ZScsICdkYXJrb3JhbmdlJyksCiAgICAgICAgICAgICAgbWFya2VyID0gbGlzdChzaXplID0gNCwgb3BhY2l0eSA9IDAuMzUpCiAgICAgICAgICAgICApICU+JSAKICAgIGFkZF9tYXJrZXJzKCkgJT4lCiAgICBsYXlvdXQoc2NlbmUgPSBsaXN0KAogICAgICAgICAgICAgICAgICAgeGF4aXMgPSBsaXN0KHRpdGxlID0gJ3N1cHBvcnQnKSwKICAgICAgICAgICAgICAgICAgIHlheGlzID0gbGlzdCh0aXRsZSA9ICdjb25maWRlbmNlJyksCiAgICAgICAgICAgICAgICAgICB6YXhpcyA9IGxpc3QodGl0bGUgPSAnbGlmdCcpCiAgICAgICAgICAgICAgICAgICApKQpwCmBgYAoKCg==